source(here::here("code/load.R"))

dat <- read_csv(here("data/estimates.csv"))
survey_dat <- readRDS(here("data/survey_clean.rds"))


# Main plot
main_plot <- function(fn, metrics, omit = NULL) {
    fn <- here::here(sprintf("fig/power_plot%s.png", fn))
    power <- seq(0.06, .99, .01)
    get_share <- function(x) sapply(power, function(p) mean(x > p, na.rm = TRUE))

    if (!is.null(omit)) {
        dat_plot <- dat |> filter(meta_id != {{omit}})
    } else {
        dat_plot <- dat
    }

    dat_plot <- lapply(dat_plot[, metrics], get_share) |>
        data.frame() |>
        setNames(names(metrics)) |>
        mutate(power = power) |>
        pivot_longer(cols = seq_along(metrics)) |>
        # use factor to sort colors in the legend
        arrange(power, value) |>
        mutate(name = factor(name, name[length(metrics):1]),
               value = value * 100,
               power = power * 100)

    if (!is.null(omit)) {
        dat_plot$meta_id <- omit
        return(dat_plot)
    }

    p <- ggplot() +
      #strip plots
      geom_boxplot(survey_dat,
                   mapping = aes(x = 50, y = power_50),
                   width = 5,
                   outlier.shape = NA,
                   alpha = 0,
                   coef = 0) +
      geom_jitter(survey_dat,
                  mapping = aes(x = 50, y = power_50),
                  width = 1,
                  alpha = 0.5) +
      geom_boxplot(survey_dat,
                   mapping = aes(x = 80, y = power_80),
                   width = 5,
                   outlier.shape = NA,
                   alpha = 0,
                   coef = 0) +
      geom_jitter(survey_dat,
                  mapping = aes(x = 80, y = power_80),
                  width = 1,
                  alpha = 0.5) +
      geom_line(dat_plot, mapping = aes(x = power, y = value, color = name)) +
      scale_x_continuous(breaks = seq(0, 100, 20)) +
      scale_y_continuous(breaks = seq(0, 100, 20), limits = c(0, 100)) +
      theme(legend.title = element_blank()) +
      scale_color_manual(values = discrete_palette) +
      labs(x = "Power level", y = "Share of all estimates")
    ggsave(p, filename = fn, width = 6, height = 4)
}

main_plot(
    fn <- "",
    metrics = c(
        "PET-PEESE" = "power_petpeese",
        "UWLS" = "power_uwls",
        "WAAP-UWLS" = "power_waapuwls"))

main_plot(
  fn <- "_1tail",
  metrics = c(
    "PET-PEESE" = "power_petpeese_1tail",
    "UWLS" = "power_uwls_1tail",
    "WAAP-UWLS" = "power_waapuwls_1tail"))

main_plot(
    fn <- "_5x",
    metrics = c(
        "PET-PEESE" = "power_petpeese_5x",
        "UWLS" = "power_uwls_5x",
        "WAAP-UWLS" = "power_waapuwls_5x"))

main_plot(
    fn <- "_outliers",
    metrics = c(
        "UWLS: IQR on coefficients" = "power_uwls_tukeyB",
        "UWLS: DFBeta" = "power_uwls_dfbeta",
        "UWLS: IQR on standard errors" = "power_uwls_tukeySE"))

# LOGO
dat_plot <- map_dfr(
    unique(dat$meta_id), function(x) 
    main_plot(fn = "", metrics = c("UWLS" = "power_uwls"), omit = x))
p <- ggplot(dat_plot, aes(power, value, level = meta_id)) +
    geom_line(size = .1, alpha = 0.5) +
    scale_x_continuous(breaks = seq(0, 100, 20)) +
    scale_y_continuous(breaks = seq(0, 100, 20), limits = c(0, 100)) +
    scale_color_discrete(guide = "none") +
    labs(x = "Power level", y = "Share of all estimates")
ggsave(p, filename = here("fig/robust_logo.png"), width = 6, height = 4)


# Distribution of absolute z statistics
# 5x doesn't make sense here
plot_z <- function(drop_outliers = FALSE) {
    binwidth <- 0.05
    if (isTRUE(drop_outliers)) {
        x <- dat |> filter(outlier_tukeyB == FALSE &
                           outlier_tukeySE == FALSE)
        fn <- here("fig/zstat_distribution_outliers.png")
    } else {
        x <- dat
        fn <- here("fig/zstat_distribution.png")
    }
    x |>
      mutate(z_stat_abs = abs(z_stat)) |>
      filter(z_stat_abs < 10) |>
      mutate(n = n()) |>
      ggplot(aes(x = z_stat_abs)) +
      geom_vline(xintercept = abs(qnorm (0.005)), color = "red", alpha = 0.75, linetype = 2) +
      geom_vline(xintercept = abs(qnorm(0.025)), color = "red", alpha = 0.75, linetype = 2) +
      # geom_vline(xintercept = abs(qnorm(0.05)), color = "red", alpha = 0.75) +
      annotate("text", x = abs(qnorm(0.025)) - .3, y = 350, label = "95%", size = 3) +
      annotate("text", x = abs(qnorm(0.005)) + .3, y = 350, label = "99%", size = 3) +
      geom_histogram(binwidth = binwidth, boundary = 0, alpha = 0.75) +
      #scaling factor for density * total obs * bin width
      # geom_density(aes(y = ..density.. * n * binwidth)) +
      scale_x_continuous(breaks = seq(0, 10, 1)) +
      labs(x = "|z|", y = "Number of estimates", fill = "") +
      theme(panel.grid = element_blank()) 
      ggsave(filename = fn, width = 6, height = 4)
}
plot_z(drop_outliers = FALSE)
plot_z(drop_outliers = TRUE)

# # What fraction of questions pass a Precision Effect Test (PET) at p < 0.1?
# dat |>
#   filter(truth_method_alt == "all") |>
#   group_by(pet_genuine_all, question_id) |>
#   summarize(pet_genuine = mean(pet_genuine_all)) |>
#   summarise(n = n())



# Number of estimates per research question
p <- dat |>
  count(question_id) |>
  ggplot(aes(x = n)) +
  geom_histogram(bins = 30) +
  scale_x_continuous(trans = "log10") +
  labs(x = "Number of estimates per research question (log10)")
ggsave(p, filename = here("fig/n_per_question.png"), width = 6, height = 4)

# mean and median power over time
label_interval <- function(breaks) {
  paste0(breaks[1:length(breaks) - 1], "-", breaks[2:length(breaks)])
}

breaks <- seq(1990, 2020, 5)

p <- dat |>
  filter (study_year > 1989) |>
  mutate(year_5 = cut(study_year,
                      breaks = breaks,
                      labels = label_interval(breaks))) |>
  group_by(year_5) |>
  summarize(uwls_Median = median(power_uwls) * 100,
            uwls_Mean = mean (power_uwls) * 100,
            n = n()) |>
  filter(!is.na(year_5)) |>
  pivot_longer(starts_with("uwls"),
               names_to = "measure",
               names_prefix = "uwls_",
               values_to = "power") |>
  ggplot(aes(x = year_5, y = power, fill = measure)) +
  geom_col(position = "dodge") +
  labs(fill = "",
       x = "Year bins",
       y = "Power level") + 
  scale_fill_manual(values = discrete_palette)
ggsave(p, filename = here("fig/power_bins.png"), width = 6, height = 4)

# Reshape to plot
power_long <- dat |>
    select(question_id, study_year, power_petpeese, power_uwls, power_waapuwls) |>
    pivot_longer(col = starts_with("power")) |>
    mutate(name = case_when(
        name == "power_petpeese" ~ "PET-PEESE",
        name == "power_uwls" ~ "UWLS",
        name == "power_waapuwls" ~ "WAAP-UWLS"),
        name = factor(name, levels = c("UWLS", "WAAP-UWLS", "PET-PEESE")))

typeS_long <- dat |>
    select(question_id, typeS_petpeese, typeS_uwls, typeS_waapuwls) |>
    pivot_longer(col = 2:4) |>
    mutate(name = case_when(
        name == "typeS_petpeese" ~ "PET-PEESE",
        name == "typeS_uwls" ~ "UWLS",
        name == "typeS_waapuwls" ~ "WAAP-UWLS"),
        name = factor(name, levels = c("UWLS", "WAAP-UWLS", "PET-PEESE")))

typeM_long <- dat |>
    select(question_id, typeM_petpeese, typeM_uwls, typeM_waapuwls) |>
    pivot_longer(col = 2:4) |>
    mutate(name = case_when(
        name == "typeM_petpeese" ~ "PET-PEESE",
        name == "typeM_uwls" ~ "UWLS",
        name == "typeM_waapuwls" ~ "WAAP-UWLS"),
        name = factor(name, levels = c("UWLS", "WAAP-UWLS", "PET-PEESE"))) |>
    filter(!value %in% c(Inf, -Inf, NA, NaN), abs(value) < 1e2)

# Power over study_year
tmp <- power_long |>
    filter(study_year >= 1990) |>
    group_by(name, study_year) |>
    summarize(value = mean(value) * 100)

p <- power_long |>
  filter(study_year >= 1990) |>
  mutate(name = factor(name)) |>
  ggplot(aes(x = study_year, y = value * 100)) +
  stat_bin2d() +
  scale_fill_viridis_c(trans = "log10") +
  geom_smooth(color = "red", se = F) +
  # geom_line(data = tmp, aes(study_year, value), size = 1, color = "pink") +
  facet_wrap(~name) +
  labs(fill = "n estimates", x = "Year of study", y = "Power level")
ggsave(p, filename = here("fig/power_time.png"), width = 8, height = 2)

#power over time for slides
p <- power_long |>
  filter(study_year >= 1990,
         name == "UWLS") |>
  mutate(name = factor(name)) |>
  ggplot(aes(x = study_year, y = value * 100)) +
  stat_bin2d() +
  scale_fill_viridis_c(trans = "log10") +
  geom_smooth(color = "red", se = F) +
  # geom_line(data = tmp, aes(study_year, value), size = 1, color = "pink") +
  labs(fill = "n estimates", x = "Year of study", y = "Power level")
ggsave(p, filename = here("fig/power_time_slides.png"), width = 6, height = 4)
# tmp <- power_long |>
#     filter(study_year > 1989, name == "UWLS") |>
#     group_by(name, study_year, question_id) |>
#     summarize(value = median(value)) |>
#     ungroup()

# library(ggExtra)
# p <- ggplot(tmp, aes(study_year, value)) +
#     geom_jitter(alpha = .3) +
#     geom_smooth() +
#     labs(fill = "n estimates", x = "Year of study", y = "Power level")
# p <- ggMarginal(p, type = "histogram")
# ggsave(p, filename = "~/Downloads/margins.png")


# Histograms of power and type S errors for all three types of truth estimates
p <- ggplot(power_long, aes(x = value)) +
    geom_histogram(bins = 30) +
    facet_wrap(~name) +
    labs(x = "Power", y = "")
ggsave(p, filename = here("fig/hist_power.png"), width = 7, height = 2)

p <- ggplot(typeS_long, aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~name) +
  labs(x = "Type S", y = "")
ggsave(p, filename = here("fig/hist_typeS.png"), width = 7, height = 2)

p <- ggplot(typeM_long, aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~name) +
  labs(x = "Type M", y = "")
ggsave(p, filename = here("fig/hist_typeM.png"), width = 7, height = 2)

# Histogram of median power per question_id (doing this by meta_id looks very similar)
p <- power_long |>
  group_by(question_id, name) |>
  summarize(median_power = median(value) * 100) |>
  ggplot(aes(x = median_power)) +
  geom_histogram(breaks = seq(0, 100, 5)) +
  scale_x_continuous(breaks = seq(0, 100, 20)) +
  facet_wrap(~name) +
  labs(x = "Median power per meta-analysis", y = "")
ggsave(p, filename = here("fig/median_power.png"), width = 7, height = 2)

#median power per question_id for slides
p <- power_long |>
  filter(name == "UWLS") |>
  group_by(question_id) |>
  summarize(median_power = median(value) * 100) |>
  ggplot(aes(x = median_power)) +
  geom_histogram(breaks = seq(0, 100, 5)) +
  scale_x_continuous(breaks = seq(0, 100, 20)) +
  labs(x = "Median power per meta-analysis", y = "")
ggsave(p, filename = here("fig/median_power_slides.png"), width = 6, height = 4)

# research inflation
p <- dat |>
  filter(abs(z_stat) >= qnorm(0.975),
         sign(estimate) == sign(uwls)) |>
  ggplot(aes(estimate / uwls)) +
  geom_histogram() + 
  scale_x_log10(breaks = 10^seq(-1, 5, 1),
                labels = scales::label_number(scale_cut = scales::cut_short_scale())) +
  labs(x = "Estimate divided by UWLS \"truth\" value", y = "")
ggsave(p, filename = here("fig/magnitude.png"), width = 6, height = 4)

  # Survey data analysis, do estimates in meta-analyses have less power?
p <- survey_dat |>
  ggplot(aes(x = fct_rev(bias))) +
  geom_histogram(stat = "count") +
  coord_flip() +
  labs(x = "", y = "Number of respondents")
ggsave(p, filename = here("fig/bias.png"), width = 6, height = 2)





# LOO: sign and magnitude
dat <- read_csv(here("data/estimates.csv"))

sm <- function(omit = "ArcNic2009") {
    tmp <- dat |>
        filter(meta_id != omit) |>
        mutate(wrongsign = sign(estimate) != sign(uwls))
    overestimation <- tmp |>
        filter(abs(z_stat) > qnorm(0.975),
            sign(estimate) == sign(uwls)) |>
        summarize(overestimation = median(estimate / uwls)) |>
        pull(overestimation)
    sig <- tmp |> filter(abs(z_stat) >= qnorm(0.975))
    sigsign <- sig |> filter(!wrongsign)
    wrongsign <- (nrow(sig) - nrow(sigsign)) / nrow(sig) * 100
    out <- data.frame(wrongsign, overestimation, meta_id = omit)
    return(out)
}



d1 <- dat |> 
  summarize(Power = median(power_uwls), .by = c("subfield", "question_id")) |>
  mutate("Estimate" = "Meta-analytic estimate") |>
  select(subfield, Power, Estimate)
d2 <- dat |> 
  summarize(Power = median(power_uwls), .by = c("subfield", "meta_id")) |>
  mutate("Estimate" = "Meta-analytic article") |>
  select(subfield, Power, Estimate)
d <- rbind(d1, d2)

p <- ggplot(d, aes(Power, linetype = Estimate)) +
  geom_density() +
  # facet_wrap(~subfield, scales = "free") +
  facet_wrap(~subfield) +
  labs(x = "Median power (UWLS estimate)", y = "", linetype = "") +
  theme(legend.position = "bottom")
ggsave(p, filename = here("fig/median_power_subfield.png"), width = 6, height = 4)

# # Histogram of median power per question_id, by subfield
# p <- dat |>
#   group_by(question_id, subfield) |>
#   summarize(median_power = median(power_uwls) * 100) |>
#   ggplot(aes(x = median_power)) +
#   geom_histogram(aes(y = after_stat(count / sum(count))), breaks = seq(0, 100, 5)) +
#   scale_y_continuous(labels = scales::percent_format()) +
#   scale_x_continuous(breaks = seq(0, 100, 20)) +
#   facet_wrap(~subfield) +
#   labs(x = "Median power per meta-analysis",
#        caption = "Unrestricted WLS was used to estimate the population mean effects")
# ggsave(p, filename = here("fig/median_power_subfield.png"), width = 6, height = 4)


# # Empirical and theoretical type M and type S
# dat <- dat |>
#     mutate(wrongsign = sign(estimate) != sign(uwls),
#            overestimation = (abs((estimate - uwls) / uwls) - 1) * 100)
# sig <- dat |>
#     filter(abs(z_stat) >= qnorm(0.975))
# sigsign <- sig |>
#     filter(!wrongsign)
#     
# overestimation <- sprintf("%.0f%%", median(sigsign$overestimation))
# wrongsign <- (nrow(sig) - nrow(sigsign)) / nrow(sig)
# wrongsign <- sprintf("%.0f%%", wrongsign * 100)
# mod1 <- glm(wrongsign ~ power_uwls, data = sig, family = binomial)
# comparisons(mod1, variables = list(power_uwls = 0.25)) |>
#     summary()
# quantreg::rq(overestimation ~ 1, data = sigsign) |>
#     summary()
# quantreg::rq(overestimation ~ I(power_uwls * 10), data = sigsign) |>
#     summary()
# models <- list(
#     "Logit" = mod1,
#     "Quantile I" = mod2,
#     "Quantile I" = mod3)
# modelsummary(models, "markdown", statistic = NULL)
